Preamble

# Chunk options: strip knitr's "-1" suffix so each figure is saved as
# "figures/<label>.png" instead of "<label>-1.png".
knitr::opts_chunk$set(warning = FALSE, message = FALSE,  fig.path = "figures/", fig.height=6.51,fig.width=8.49, dev='png', fig.process = function(filename) {
    # Anchor the pattern to the extension: a bare "-1" would remove the first
    # "-1" anywhere in the path (e.g. in a chunk label such as "fig-10").
    new_filename <- stringr::str_replace(string = filename,
                                         pattern = "-1\\.png$",
                                         replacement = ".png")
    fs::file_move(path = filename, new_path = new_filename)
    # Fall back to the original name if the move did not succeed.
    ifelse(fs::file_exists(new_filename), new_filename, filename)
  })
# Reproducibility and session setup.
set.seed(29106799)
libs <- c("ggplot2", "pracma", "glmnet", "extRemes", "foreach", "doParallel", "gridExtra", "pROC", "MLmetrics", "energy")
# vapply() guarantees one logical per package (sapply() may simplify
# unpredictably); TRUE below means the package attached successfully.
vapply(libs, require, logical(1), character.only = TRUE, quietly = TRUE, warn.conflicts = FALSE)
##    ggplot2     pracma     glmnet   extRemes    foreach doParallel 
##       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE 
##  gridExtra       pROC  MLmetrics     energy 
##       TRUE       TRUE       TRUE       TRUE
require(knitr, quietly = TRUE, warn.conflicts = FALSE)
# Knitting the tools document defines the helpers used below
# (EpidemicDataFrame, EpidemicDataFrameSentinelles, adcf, cdcf, ...),
# since knit() evaluates chunks in the calling environment.
knit("PredictEpidemicTools.Rmd", quiet = TRUE)
## [1] "PredictEpidemicTools.md"
detectCores()
## [1] 8
# One worker fewer than the detected cores (at least 1), instead of the
# hard-coded 7, so the script adapts to the machine it runs on.
cl <- makeCluster(max(1, detectCores() - 1))

registerDoParallel(cl)

getDoParWorkers()
## [1] 7

Section 2 : The ILI data

Preparation of the data

Import the data

# Read the weekly ILI incidence rates (semicolon-separated export).
incidences <- read.csv("ILIincidences1985-2019.csv", sep = ";")

# Drop rows with a missing incidence rate, then coerce t_inc to numeric.
incidences          <- incidences[!is.na(incidences$t_inc),]
# as.character() first: this works whether t_inc was read as a factor
# (R < 4.0 default) or as character (R >= 4.0), unlike the fragile
# as.numeric(levels(x))[x] idiom which requires a factor.
incidences$t_inc    <- as.numeric(as.character(incidences$t_inc))
incidences$yearweek <- as.numeric(incidences$yearweek)
# One calendar date per weekly observation, starting the first week of 1985;
# the assignment below errors if the lengths disagree, which is the guard.
dates               <-
seq(as.Date("1985-01-01") , as.Date("2019-03-10"), by = "weeks")
incidences$dates    <- dates

# Running week index over the whole series.
incidences$weeknum <- seq_len(length(incidences$year))

incidences <-
incidences[-228, ] # Missing data on the original file.

Sentinelles epidemics

# First and last epidemic week (Sentinelles/Serfling 'epid' flag) of each
# season.  vapply() over the seasons replaces growing vectors with c() in a
# loop; a season with no flagged week yields Inf/-Inf with a warning,
# exactly as the original min()/max() did.
seasons_serf <- unique(incidences$season)
debut_sentinelles <- vapply(seasons_serf, function(s) {
  min(incidences[(incidences$epid == 1) & (incidences$season == s), ]$weeknum)
}, numeric(1))
fin_sentinelles <- vapply(seasons_serf, function(s) {
  max(incidences[(incidences$epid == 1) & (incidences$season == s), ]$weeknum)
}, numeric(1))

# Epidemic durations in weeks, and their extremes.
l.epid_serf <- fin_sentinelles - debut_sentinelles + 1
longest <- max(l.epid_serf)
shortest <- min(l.epid_serf)

year_longest <- unique(incidences$season)[which.max(l.epid_serf)]
year_shortest <- unique(incidences$season)[which.min(l.epid_serf)]

In the Serfling definition of an epidemic, the shortest epidemic between 1985 and 2019 lasted 5 weeks (in 1991) and the longest 16 weeks (in 2010).

# Align all Serfling epidemics on a common week axis (weeks d1..d2).
d1 <- 1
d2 <- longest
sent_epid_data <-
  EpidemicDataFrameSentinelles(data = incidences, d1 = d1, d2 = d2)

# Peak incidence of each season; vapply() avoids growing a vector with c().
peaks <- vapply(unique(sent_epid_data$season), function(s) {
  max(sent_epid_data[sent_epid_data$season == s, ]$t_inc, na.rm = TRUE)
}, numeric(1))

highest_peaks <- max(peaks)
lowest_peaks <- min(peaks)

# NOTE(review): assumes unique(incidences$season) and
# unique(sent_epid_data$season) enumerate the seasons in the same order.
year_highest <- unique(incidences$season)[which.max(peaks)]
year_lowest <- unique(incidences$season)[which.min(peaks)]

In the Serfling definition of an epidemic, the highest epidemic peak between 1985 and 2019 was 1793 (in 1989) and the lowest was 325 (in 2014).

Figure 1

# Figure 1: the full weekly ILI series, with blue arrows marking the
# highest (1989) and lowest (2014) epidemic peaks.
x.plot <-
  ggplot(data = incidences, aes(x = dates, y = t_inc)) + geom_line(size = 0.3)
x.plot <-
  x.plot + xlab("Years") + ylab("Incidence rates")
x.plot <-
  x.plot + scale_x_date(date_breaks = "3 years", date_labels = "%Y")
# annotate() draws each arrow exactly once; wrapping constants in aes()
# inside geom_segment() redraws the same segment for every row of the data.
x.plot <- x.plot + annotate("segment",
  x = as.Date("1989-01-01"), y = 1940,
  xend = as.Date("1989-01-01"), yend = 1850,
  arrow = arrow(length = unit(0.1, "cm")), col = 'blue')
x.plot <- x.plot + annotate("segment",
  x = as.Date("2014-02-01"), y = 480,
  xend = as.Date("2014-02-01"), yend = 375,
  arrow = arrow(length = unit(0.1, "cm")), col = 'blue')
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 14),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  plot.title = element_text(size = 14)
)
x.plot

1985-2018 ILI data

Put the 2019 epidemic aside and save it for a test case

# Hold the 2019 season out as a test case: keep seasons up to 2018 only.
incidences2018 <- subset(x = incidences, subset = (season < 2019))
nb.epid <-
  length(unique(incidences2018$season)) ## number of epidemics between 1985 and 2018

There are 34 epidemics between 1985 and 2018.

Detection of epidemics with the method proposed in the paper

Define epidemic threshold

# Epidemic threshold: the 88% quantile of the 1985-2018 incidence rates.
level_epid <- 0.88
thres_epid <- quantile(incidences2018$t_inc,
                       probs = level_epid,
                       na.rm = TRUE)

The threshold for epidemic definition is chosen as the 0.88-quantile of the data and is equal to 272.44.

Create a new data frame containing the epidemic periods

# Build the per-season epidemic data frame, padded to d weeks, using the
# quantile-based epidemic threshold defined above.
d <- longest
epid_data_thres<-
  EpidemicDataFrame(data = incidences2018, d = d, thres = thres_epid)

Durations of the epidemic periods

# Duration (number of non-missing weeks) of each threshold-defined epidemic.
# vapply() over the seasons replaces growing l.epid with c() in a loop.
l.epid <- vapply(unique(epid_data_thres$season), function(s) {
  length(epid_data_thres[(epid_data_thres$season == s) &
                           (!is.na(epid_data_thres$t_inc)), ]$t_inc)
}, integer(1))

nb.epid <- length(l.epid)

longest_thres <- max(l.epid)
shortest_thres <- min(l.epid)

# NOTE(review): assumes unique(incidences$season) matches the order of
# unique(epid_data_thres$season).
year_longest_thres <- unique(incidences$season)[which.max(l.epid)]
year_shortest_thres <- unique(incidences$season)[which.min(l.epid)]

In our definition of an epidemic, there are 34 epidemics; the shortest epidemic between 1985 and 2018 lasted 3 weeks (in 2014) and the longest 12 weeks (in 1985).

Peaks of epidemic periods

# Rebuild the epidemic data frame padded to the longest observed duration,
# then extract each season's peak incidence.
d <- longest_thres
epid_data_thres <-
  EpidemicDataFrame(data = incidences2018, d = d, thres = thres_epid)

# vapply() avoids growing peaks_thres with c() inside the loop.
peaks_thres <- vapply(unique(epid_data_thres$season), function(s) {
  max(epid_data_thres[epid_data_thres$season == s, ]$t_inc, na.rm = TRUE)
}, numeric(1))

highest_peaks_thres <- max(peaks_thres)
lowest_peaks_thres <- min(peaks_thres)

# NOTE(review): assumes season order matches unique(incidences$season).
year_highest_thres <- unique(incidences$season)[which.max(peaks_thres)]
year_lowest_thres <- unique(incidences$season)[which.min(peaks_thres)]

In the new definition of an epidemic, the highest epidemic peak between 1985 and 2018 was 1793 (in 1989) and the lowest was 325 (in 2014).

Figure 2

Figure 2.a)

# Figure 2.a): all Serfling epidemics overlaid on a common week axis.
d1 <- 1
d2 <- longest
sent_epid_data <-
  EpidemicDataFrameSentinelles(data = incidences2018, d1 = d1, d2 = d2)
df.x <- sent_epid_data
# One geom_line() with group = season draws every season in a single layer.
# The original added one layer per season in a loop, relied on seasons
# being exactly 1984 + 1:nb.epid, and used a length-mismatched base
# aes(x = 1:d2) that each layer then overrode.
x.plot <- ggplot(data = df.x, aes(x = weeks, y = t_inc, group = season)) +
  geom_line(size = 0.3)
x.plot <- x.plot + scale_x_continuous(breaks = seq(1, 16, 1))
x.plot <-
  x.plot + xlab("Epidemic weeks") + ylab("Incidence rates")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot

Figure 2.b)

# Figure 2.b): threshold-defined epidemics overlaid on a common week axis.
df.x <- subset(epid_data_thres, weeks < d + 1)
# Single grouped layer instead of one geom_line() per season (see Fig 2.a).
x.plot <- ggplot(data = df.x, aes(x = weeks, y = t_inc, group = season)) +
  geom_line(size = 0.3)
x.plot <- x.plot + scale_x_continuous(breaks = seq(1, 16, 1), labels = seq(1, 16, 1)) + expand_limits(x=c(0,16))
x.plot <-
  x.plot + xlab("Epidemic weeks") + ylab("Incidence rates")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot

Construction of the data frame with the threshold excesses of the first 3 weeks

# Restrict the analysis to the first d = 3 epidemic weeks; these three
# weekly rates are the variables studied in the rest of the document.
d <- 3
epid_data <-
  EpidemicDataFrame(data = incidences2018, d = d, thres = thres_epid)

Correlation plots for ILI rates

# ACF of the Week-1 incidences (one value per season); ciline is the usual
# qnorm-based 95% white-noise confidence bound (negative; +/- drawn below).
acf.temp <- acf(x = epid_data[epid_data$weeks == 1,]$t_inc,lag.max = 10, plot = F)
acf.week1 <- data.frame(lag = acf.temp$lag, acf = acf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(epid_data[epid_data$weeks == 1,]$t_inc))

x.plot <- ggplot(data = acf.week1, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Autocorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Auto distance correlation of Week 1.  adcf() is defined in
# PredictEpidemicTools.Rmd; $u/$l appear to be permutation-based confidence
# bounds (nreps = 200) — TODO confirm against that file.
adcf.temp <- adcf(x = epid_data[epid_data$weeks == 1,]$t_inc,lag.range=1:10,nreps=200)
adcf.week1 <- data.frame(lag = adcf.temp$lag, adcf = adcf.temp$adcf)
ciline_u <- adcf.temp$u[1]
ciline_l <- adcf.temp$l[1]

x.plot <- ggplot(data = adcf.week1, aes(x = lag , y = adcf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Auto Distance Correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Same ACF diagnostic for Week 2.
acf.temp <- acf(x = epid_data[epid_data$weeks == 2,]$t_inc,lag.max = 10, plot = F)
acf.week2 <- data.frame(lag = acf.temp$lag, acf = acf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(epid_data[epid_data$weeks == 2,]$t_inc))

x.plot <- ggplot(data = acf.week2, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Autocorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Auto distance correlation for Week 2.
adcf.temp <- adcf(x = epid_data[epid_data$weeks == 2,]$t_inc,lag.range=1:10,nreps=200)
adcf.week2 <- data.frame(lag = adcf.temp$lag, adcf = adcf.temp$adcf)
ciline_u <- adcf.temp$u[1]
ciline_l <- adcf.temp$l[1]

x.plot <- ggplot(data = adcf.week2, aes(x = lag , y = adcf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Auto Distance Correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Same ACF diagnostic for Week 3.
acf.temp <- acf(x = epid_data[epid_data$weeks == 3,]$t_inc,lag.max = 10, plot = F)
acf.week3 <- data.frame(lag = acf.temp$lag, acf = acf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(epid_data[epid_data$weeks == 3,]$t_inc))

x.plot <- ggplot(data = acf.week3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Autocorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Auto distance correlation for Week 3.
adcf.temp <- adcf(x = epid_data[epid_data$weeks == 3,]$t_inc,lag.range=1:10,nreps=200)
adcf.week3 <- data.frame(lag = adcf.temp$lag, adcf = adcf.temp$adcf)
ciline_u <- adcf.temp$u[1]
ciline_l <- adcf.temp$l[1]

x.plot <- ggplot(data = adcf.week3, aes(x = lag , y = adcf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Auto Distance Correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross-correlation between Week-1 and Week-2 incidences across seasons.
# BUG FIX: stats::acf() has no `y` argument, so the original call silently
# ignored Week 2 and computed the *auto*correlation of Week 1; ccf() is the
# cross-correlation function (lags -10..10).
# (Also removed a stray `cdcf.temp <- c(...)` line: c() merely concatenated
# its inputs; the real cdcf() call follows in the next chunk.)
ccf.temp <- ccf(x = epid_data[epid_data$weeks == 1,]$t_inc,y = epid_data[epid_data$weeks == 2,]$t_inc,lag.max = 10, plot = F)
ccf.week1.2 <- data.frame(lag = ccf.temp$lag, acf = ccf.temp$acf)
# NOTE(review): the CI uses the Week-3 sample size (copy-paste from the ACF
# chunks); all weeks have one value per epidemic, so the lengths agree.
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(epid_data[epid_data$weeks == 3,]$t_inc))

x.plot <- ggplot(data = ccf.week1.2, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Crosscorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross distance correlation Week 1 vs Week 2.  cdcf() is defined in
# PredictEpidemicTools.Rmd; the data-frame column is named `acf` only to
# reuse the plotting template below.
cdcf.temp <- cdcf(x = epid_data[epid_data$weeks == 1,]$t_inc,y = epid_data[epid_data$weeks == 2,]$t_inc,lag.range=1:10,nreps=200)
cdcf.week1.2 <- data.frame(lag = cdcf.temp$lag, acf = cdcf.temp$cdcf)
ciline_u <- cdcf.temp$u[1]
ciline_l <- cdcf.temp$l[1]

x.plot <- ggplot(data = cdcf.week1.2, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Cross distance correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross-correlation between Week-1 and Week-3 incidences.
# BUG FIX: stats::acf() has no `y` argument — the original silently ignored
# Week 3 and computed the autocorrelation of Week 1; use ccf().
ccf.temp <- ccf(x = epid_data[epid_data$weeks == 1,]$t_inc,y = epid_data[epid_data$weeks == 3,]$t_inc,lag.max = 10, plot = F)
ccf.week1.3 <- data.frame(lag = ccf.temp$lag, acf = ccf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(epid_data[epid_data$weeks == 3,]$t_inc))

x.plot <- ggplot(data = ccf.week1.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Crosscorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross distance correlation Week 1 vs Week 3 (cdcf() from
# PredictEpidemicTools.Rmd; column named `acf` to reuse the plot template).
cdcf.temp <- cdcf(x = epid_data[epid_data$weeks == 1,]$t_inc,y = epid_data[epid_data$weeks == 3,]$t_inc,lag.range=1:10,nreps=200)
cdcf.week1.3 <- data.frame(lag = cdcf.temp$lag, acf = cdcf.temp$cdcf)
ciline_u <- cdcf.temp$u[1]
ciline_l <- cdcf.temp$l[1]

x.plot <- ggplot(data = cdcf.week1.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Cross distance correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross-correlation between Week-2 and Week-3 incidences.
# BUG FIX: stats::acf() has no `y` argument — the original silently ignored
# Week 3 and computed the autocorrelation of Week 2; use ccf().
ccf.temp <- ccf(x = epid_data[epid_data$weeks == 2,]$t_inc,y = epid_data[epid_data$weeks == 3,]$t_inc,lag.max = 10, plot = F)
ccf.week2.3 <- data.frame(lag = ccf.temp$lag, acf = ccf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(epid_data[epid_data$weeks == 3,]$t_inc))

x.plot <- ggplot(data = ccf.week2.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Crosscorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross distance correlation Week 2 vs Week 3 (cdcf() from
# PredictEpidemicTools.Rmd; column named `acf` to reuse the plot template).
cdcf.temp <- cdcf(x = epid_data[epid_data$weeks == 2,]$t_inc,y = epid_data[epid_data$weeks == 3,]$t_inc,lag.range=1:10,nreps=200)
cdcf.week2.3 <- data.frame(lag = cdcf.temp$lag, acf = cdcf.temp$cdcf)
ciline_u <- cdcf.temp$u[1]
ciline_l <- cdcf.temp$l[1]

x.plot <- ggplot(data = cdcf.week2.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Cross distance correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

Section 4

Section 4.1 : Risks of extreme rates in the next year

Define the GPD threshold

# GPD threshold per week: the 90% quantile of all 1985-2018 rates.
# quantile() is vectorised over `probs`, so one call replaces the loop;
# unname() drops the "90%" names so thres_gpd stays a plain numeric vector
# exactly as before.
level_gpd <- c(0.9, 0.9, 0.9)
thres_gpd <- unname(quantile(incidences2018$t_inc,
                             probs = level_gpd,
                             na.rm = TRUE))

The GPD threshold for Weeks 1, 2 and 3 is defined as the 0.9, 0.9, 0.9-quantile of the data, that is 339.2, 339.2, 339.2.

Add to the dataframe the excesses for extreme epidemics

# Negation of %in%; use <- for assignment (reserve = for arguments).
`%nin%` <- Negate(`%in%`)

epid_data$excess <- rep(NA, length(epid_data$season))

# Per season: if no week exceeds its GPD threshold, leave the excesses NA;
# otherwise store t_inc - thres_gpd for all d weeks.  The rows of a season
# are ordered week 1..d, so the length-3 thres_gpd lines up element-wise
# with the weeks; sub-threshold weeks get negative "excesses".
for (j in unique(epid_data$season)) {
  temp <- epid_data[epid_data$season == j,]
  if (sum(temp$t_inc > thres_gpd, na.rm = TRUE) == 0) {
    epid_data[epid_data$season == j,]$excess <- rep(NA, d)
  } else {
    epid_data[epid_data$season == j,]$excess <-
      temp$t_inc - thres_gpd
  }
}

Correlation plots for excesses

# ACF of the Week-1 excesses (NAs dropped); ciline is the qnorm-based 95%
# white-noise confidence bound.
excess1 <- epid_data[(epid_data$weeks == 1),]$excess
acf.temp <- acf(x = excess1[!is.na(excess1)],lag.max = 10, plot = F)
acf.excess1 <- data.frame(lag = acf.temp$lag, acf = acf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(excess1[!is.na(excess1)]))

x.plot <- ggplot(data = acf.excess1, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Autocorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Auto distance correlation of the Week-1 excesses (adcf() from
# PredictEpidemicTools.Rmd; $u/$l presumably permutation bounds — confirm).
adcf.temp <- adcf(x = excess1[!is.na(excess1)],lag.range=1:10,nreps=200)
adcf.excess1 <- data.frame(lag = adcf.temp$lag, adcf = adcf.temp$adcf)
ciline_u <- adcf.temp$u[1]
ciline_l <- adcf.temp$l[1]

x.plot <- ggplot(data = adcf.excess1, aes(x = lag , y = adcf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Auto Distance Correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Same ACF diagnostic for the Week-2 excesses.
excess2 <- epid_data[(epid_data$weeks == 2),]$excess
acf.temp <- acf(x = excess2[!is.na(excess2)],lag.max = 10, plot = F)
acf.excess2 <- data.frame(lag = acf.temp$lag, acf = acf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(excess2[!is.na(excess2)]))

x.plot <- ggplot(data = acf.excess2, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Autocorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Auto distance correlation for the Week-2 excesses.
adcf.temp <- adcf(x = excess2[!is.na(excess2)],lag.range=1:10,nreps=200)
adcf.excess2 <- data.frame(lag = adcf.temp$lag, adcf = adcf.temp$adcf)
ciline_u <- adcf.temp$u[1]
ciline_l <- adcf.temp$l[1]

x.plot <- ggplot(data = adcf.excess2, aes(x = lag , y = adcf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Auto Distance Correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Same ACF diagnostic for the Week-3 excesses.
excess3 <- epid_data[(epid_data$weeks == 3),]$excess
acf.temp <- acf(x = excess3[!is.na(excess3)],lag.max = 10, plot = F)
acf.excess3 <- data.frame(lag = acf.temp$lag, acf = acf.temp$acf)
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(excess3[!is.na(excess3)]))

x.plot <- ggplot(data = acf.excess3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Autocorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Auto distance correlation for the Week-3 excesses.
adcf.temp <- adcf(x = excess3[!is.na(excess3)],lag.range=1:10,nreps=200)
adcf.excess3 <- data.frame(lag = adcf.temp$lag, adcf = adcf.temp$adcf)
ciline_u <- adcf.temp$u[1]
ciline_l <- adcf.temp$l[1]

x.plot <- ggplot(data = adcf.excess3, aes(x = lag , y = adcf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Auto Distance Correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross-correlation of Week-1 vs Week-2 excesses.
# BUG FIX: stats::acf() ignores an unknown `y` argument, so the original
# computed the autocorrelation of excess1 only; ccf() gives the intended
# cross-correlation (the NA pattern is per season, so both series have the
# same length).
ccf.temp <- ccf(x = excess1[!is.na(excess1)],y = excess2[!is.na(excess2)],lag.max = 10, plot = F)
ccf.excess1.2 <- data.frame(lag = ccf.temp$lag, acf = ccf.temp$acf)
# NOTE(review): this CI uses length(excess1) *including* NAs, unlike the
# ACF chunks above which drop NAs first — confirm which n is intended.
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(excess1))

x.plot <- ggplot(data = ccf.excess1.2, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Crosscorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross distance correlation of Week-1 vs Week-2 excesses (cdcf() from
# PredictEpidemicTools.Rmd; column named `acf` to reuse the plot template).
cdcf.temp <- cdcf(x = excess1[!is.na(excess1)],y = excess2[!is.na(excess2)],lag.range=1:10,nreps=200)
cdcf.excess1.2 <- data.frame(lag = cdcf.temp$lag, acf = cdcf.temp$cdcf)
ciline_u <- cdcf.temp$u[1]
ciline_l <- cdcf.temp$l[1]

x.plot <- ggplot(data = cdcf.excess1.2, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Cross distance correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross-correlation of Week-1 vs Week-3 excesses.
# BUG FIX: stats::acf() ignores an unknown `y` argument, so the original
# computed the autocorrelation of excess1 only; ccf() is intended.
ccf.temp <- ccf(x = excess1[!is.na(excess1)],y = excess3[!is.na(excess3)],lag.max = 10, plot = F)
ccf.excess1.3 <- data.frame(lag = ccf.temp$lag, acf = ccf.temp$acf)
# NOTE(review): CI uses length(excess1) including NAs — confirm intended n.
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(excess1))

x.plot <- ggplot(data = ccf.excess1.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Crosscorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross distance correlation of Week-1 vs Week-3 excesses (cdcf() from
# PredictEpidemicTools.Rmd; column named `acf` to reuse the plot template).
cdcf.temp <- cdcf(x = excess1[!is.na(excess1)],y = excess3[!is.na(excess3)],lag.range=1:10,nreps=200)
cdcf.excess1.3 <- data.frame(lag = cdcf.temp$lag, acf = cdcf.temp$cdcf)
ciline_u <- cdcf.temp$u[1]
ciline_l <- cdcf.temp$l[1]

x.plot <- ggplot(data = cdcf.excess1.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Cross distance correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross-correlation of Week-2 vs Week-3 excesses.
# BUG FIX: stats::acf() ignores an unknown `y` argument, so the original
# computed the autocorrelation of excess2 only; ccf() is intended.
ccf.temp <- ccf(x = excess2[!is.na(excess2)],y = excess3[!is.na(excess3)],lag.max = 10, plot = F)
ccf.excess2.3 <- data.frame(lag = ccf.temp$lag, acf = ccf.temp$acf)
# NOTE(review): CI uses length(excess1) (copy-paste) including NAs —
# confirm intended n; the three excess vectors all have one entry per
# season, so the lengths agree.
ciline <- qnorm((1 - 0.95)/2)/sqrt(length(excess1))

x.plot <- ggplot(data = ccf.excess2.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = -ciline, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Crosscorrelation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

# Cross distance correlation of Week-2 vs Week-3 excesses (cdcf() from
# PredictEpidemicTools.Rmd; column named `acf` to reuse the plot template).
cdcf.temp <- cdcf(x = excess2[!is.na(excess2)],y = excess3[!is.na(excess3)],lag.range=1:10,nreps=200)
cdcf.excess2.3 <- data.frame(lag = cdcf.temp$lag, acf = cdcf.temp$cdcf)
ciline_u <- cdcf.temp$u[1]
ciline_l <- cdcf.temp$l[1]

x.plot <- ggplot(data = cdcf.excess2.3, aes(x = lag , y = acf)) + geom_bar(stat = 'identity', width = 0.03)
x.plot <- x.plot + geom_hline(yintercept = ciline_u, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = ciline_l, color = "blue",linetype =2)
x.plot <- x.plot + geom_hline(yintercept = 0, size = 0.2)
x.plot <-
  x.plot + xlab("Lag") + ylab("Cross distance correlation")
x.plot <- x.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
x.plot <- x.plot + scale_x_continuous(breaks = seq(0, 10, 2)) +scale_y_continuous(breaks = seq(-1, 1, .2))
x.plot

Fit an exponential distribution

# Fit an exponential distribution (GPD with gamma = 0) to each week's
# threshold excesses and collect 95% CIs for the scale parameter.
univ.fit_exp <- list()
sigma_univ <- rep(0, d)
df.param0 <- NULL  # rbind(NULL, x) == x, so no dummy NA row to drop later
for (i in seq_len(d)) {
  week <-
    epid_data[(epid_data$weeks == i) &
                !is.na(epid_data$t_inc),]$t_inc
  # extRemes::fevd with type = "Exponential": GPD constrained to gamma = 0.
  univ.fit_exp[[i]] <-
    fevd(
      x = week,
      threshold = thres_gpd[i],
      type = "Exponential",
      method = "MLE"
    )
  sigma_univ[i] <- univ.fit_exp[[i]]$results$par[1]
  ci.univ.fit <-
    ci.fevd(univ.fit_exp[[i]], 0.05, type = 'parameter')
  df.param0 <- rbind(df.param0, ci.univ.fit)
}

Table 1 Weeks 1,2 and 3

# Table 1: 95% CI (lower bound, scale estimate, upper bound) of the
# exponential scale parameter for each of the first three epidemic weeks.
rownames(df.param0) <- c("Week1", "Week2", "Week3")
kable(df.param0)
95% lower CI scale 95% upper CI
Week1 40.4452 72.0000 103.5548
Week2 166.1306 256.3806 346.6307
Week3 251.5334 391.7000 531.8666

LR tests for testing \(\gamma=0\)

# Fit the unconstrained GP model to each week, then run the likelihood-
# ratio test of gamma = 0 (the exponential is nested inside the GP).
univ.fit_gpd <- list()
for (i in seq_len(d)) {
  week <-
    epid_data[(epid_data$weeks == i) &
                    !is.na(epid_data$t_inc), ]$t_inc
  univ.fit_gpd[[i]] <- fevd(
      x = week,
      threshold = thres_gpd[i],
      type = "GP",
      method = "MLE"
    )
}

# rep(0, d) rather than the hard-coded 3, to stay consistent with d.
pvalue <- rep(0, d)
for (i in seq_len(d)) {
  lr <- lr.test(univ.fit_gpd[[i]], univ.fit_exp[[i]])
  pvalue[i] <- lr$p.value
}

The p-values of the LR tests for \(\gamma = 0\) for Weeks 1, 2 and 3 are equal to 0.6568338, 0.5730354, 0.6410537 respectively.

Exponential qqplots (Supplementary material)

# Exponential QQ-plot, Week 1: observed excesses over thres_gpd[1] against
# theoretical quantiles of the fitted exponential (scale = sigma_univ[1]).
week1 <-
  epid_data[(epid_data$weeks == 1) &
              !is.na(epid_data$t_inc),]$t_inc
obs1 <- week1[week1 > thres_gpd[1]] - thres_gpd[1]
theo1  <-
  qevd(
    p = ppoints(length(obs1)),
    scale = sigma_univ[1],
    threshold = 0,
    type = "Exponential"
  )

df_qq1 <- data.frame(Obs = sort(obs1), Theo = theo1)
qq.plot1 <-
  ggplot(data = df_qq1, aes(x = Obs, y = Theo)) + geom_point(shape = 1, size = 2.5)
# y = x reference line.
qq.plot1 <-
  qq.plot1 + geom_abline(
    slope = 1,
    intercept = 0,
    col = "blue",
    linetype = 2,
    size = 0.5
  )
qq.plot1 <- qq.plot1 + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
qq.plot1  <-  qq.plot1 + xlim(0, 300) + ylim(0 , 300) + coord_fixed()
# Typo fix in the axis label: "Theorical" -> "Theoretical".
qq.plot1 <- qq.plot1 + xlab("Observed") + ylab("Theoretical")
qq.plot1

# Exponential QQ-plot, Week 2 (same construction as Week 1).
week2 <-
  epid_data[(epid_data$weeks == 2) &
              !is.na(epid_data$t_inc),]$t_inc
obs2 <- week2[week2 > thres_gpd[2]] - thres_gpd[2]
theo2  <-
  qevd(
    p = ppoints(length(obs2)),
    scale = sigma_univ[2],
    threshold = 0,
    type = "Exponential"
  )

df_qq2 <- data.frame(Obs = sort(obs2), Theo = theo2)
qq.plot2 <-
  ggplot(data = df_qq2, aes(x = Obs, y = Theo)) + geom_point(shape = 1, size = 2.5)
# y = x reference line.
qq.plot2 <-
  qq.plot2 + geom_abline(
    slope = 1,
    intercept = 0,
    col = "blue",
    linetype = 2,
    size = 0.5
  )
qq.plot2 <- qq.plot2 + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
qq.plot2  <-  qq.plot2 + xlim(0, 1100) + ylim(0 , 1100) + coord_fixed()
# Typo fix in the axis label: "Theorical" -> "Theoretical".
qq.plot2 <- qq.plot2 + xlab("Observed") + ylab("Theoretical")
qq.plot2

# Exponential QQ-plot, Week 3 (same construction as Weeks 1 and 2).
week3 <-
  epid_data[(epid_data$weeks == 3) &
              !is.na(epid_data$t_inc),]$t_inc
obs3 <- week3[week3 > thres_gpd[3]] - thres_gpd[3]
theo3  <-
  qevd(
    p = ppoints(length(obs3)),
    scale = sigma_univ[3],
    threshold = 0,
    type = "Exponential"
  )

df_qq3 <- data.frame(Obs = sort(obs3), Theo = theo3)
qq.plot3 <-
  ggplot(data = df_qq3, aes(x = Obs, y = Theo)) + geom_point(shape = 1, size = 2.5)
# y = x reference line.
qq.plot3 <-
  qq.plot3 + geom_abline(
    slope = 1,
    intercept = 0,
    col = "blue",
    linetype = 2,
    size = 0.5
  )
qq.plot3 <- qq.plot3 + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
qq.plot3  <-
qq.plot3 + xlim(0, 1650) + ylim(0 , 1650) +  coord_fixed()
# Typo fix in the axis label: "Theorical" -> "Theoretical".
qq.plot3 <- qq.plot3 + xlab("Observed") + ylab("Theoretical")
qq.plot3

Table 2 (Week3)

# Table 2: n-year return levels for Week 3 at risk levels alpha.
nb.excess.Week3 <- length(obs3)
# Empirical probability that a Week-3 rate exceeds the GPD threshold.
p.u <- nb.excess.Week3 / nb.epid

# Return level R such that P(exceed R within n years) = alpha under the
# fitted exponential excess model; factored out instead of repeating the
# same formula four times.
return_level <- function(n, alpha, thres, sigma, pu) {
  thres + sigma * (log(pu) - log(1 - (1 - alpha) ^ (1 / n)))
}

R1.10  <- return_level(n = 1,  alpha = 0.1,  thres = thres_gpd[3], sigma = sigma_univ[3], pu = p.u)  # 1 year, 10%
R1.1   <- return_level(n = 1,  alpha = 0.01, thres = thres_gpd[3], sigma = sigma_univ[3], pu = p.u)  # 1 year, 1%
R10.10 <- return_level(n = 10, alpha = 0.1,  thres = thres_gpd[3], sigma = sigma_univ[3], pu = p.u)  # 10 years, 10%
R10.1  <- return_level(n = 10, alpha = 0.01, thres = thres_gpd[3], sigma = sigma_univ[3], pu = p.u)  # 10 years, 1%

# Preserve the globals the original chunk left behind (n = 10, alpha = 0.01)
# in case later chunks rely on them.
n <- 10
alpha <- 0.01

risks <- data.frame(R1.10, R1.1, R10.10, R10.1)
rownames(risks) <- c("risks")
colnames(risks) <- c("1 year - 10%", "1 year - 1%", "10 years - 10%", "10 years - 1%")
kable(risks)
1 year - 10% 1 year - 1% 10 years - 10% 10 years - 1%
risks 1192.096 2094.019 2075.627 2994.171

The estimate of \(\widehat p_u\) is equal to 0.8823529.

Section 4.2

Standardize the excesses

# Standardize each week's excesses by its marginal Exponential scale.
epid_data$excess_stand <- rep(NA, length(epid_data$season))
for (wk in seq_len(d)) {
  rows <- epid_data$weeks == wk
  epid_data$excess_stand[rows] <- epid_data$excess[rows] / sigma_univ[wk]
}

Consider only extreme epidemics

# Keep only the extreme epidemics: drop every season that has a missing
# excess, then lay out the standardized excesses as one row per epidemic
# (d columns = weeks 1..d).
season_NA <- unique(epid_data$season[is.na(epid_data$excess)])
epid_data_ssNA <- subset(x = epid_data, season %nin% season_NA)

excess_stand_matrix <- matrix(
  epid_data_ssNA$excess_stand,
  ncol = d,
  byrow = TRUE
)
# Each extreme epidemic contributes exactly 3 non-missing excesses.
nb.epid_extreme <- sum(!is.na(epid_data$excess)) / 3

The number of extreme epidemics is 32.

Model selection

For each model, we run the optimisation 50 times and keep the parameters of the iteration with the smallest -log(LLK). Since this takes a long time, the results are saved to a file and read back below.

Fit a MGPD Gumbel model
# The 50 optimisation runs below are kept commented out for reproducibility;
# their result is read back from res.GumbelFitWeek3.csv.
#  nb.run <- 50
# fit.Gumbel_fun <- function(i) {
#    library('pracma')
# alpha0 <- runif(d, 1.2, 50)
# b0 <- runif(d - 1, -1, 1) # if beta1 is fixed to 0 beta2 and beta 3 are close to 0
# beta1 <- 0
# 
# fitGumbelU_standard <- optim(
#     par = c(alpha0, b0),
#     fn = LLKGumbelMGPDstand_U,
#     x = excess_stand_matrix,
#     beta1 = beta1,
#     lw = c(1.1,-2),
#     up = c(100, 2),
#     control = list(maxit = 1e7, reltol = 1e-15)
#   )
# 
#    return(
#      c(
#        fitGumbelU_standard$par[1:d],
#        fitGumbelU_standard$par[-(1:d)],
#        fitGumbelU_standard$value
#      )
# )
# }
# 
# 
#  x_gumbel <- foreach(i = 1:nb.run) %dopar%  fit.Gumbel_fun(i)
#  res.Gumbel  <-
#    matrix(unlist(x_gumbel),
#           nrow = nb.run,
#           ncol = 2 * d,
#           byrow = T)
# 
#  res.Gumbel <- as.data.frame(res.Gumbel)
#  colnames(res.Gumbel) <-
#    c("alpha1", "alpha2", "alpha3", "beta2", "beta3", "LLK")
#  bestGumbel <- which.min(res.Gumbel$LLK)
# 
# write.csv(res.Gumbel, file = 'res.GumbelFitWeek3.csv')
res.Gumbel <- read.csv(file = "res.GumbelFitWeek3.csv", header = TRUE, sep = ",")
res.Gumbel <- res.Gumbel[, -1]  # drop the row-name column written by write.csv
bestGumbel <- which.min(res.Gumbel$LLK)

# 5 free parameters (alpha1:3, beta2:3); the last column (index d + 3) is
# the minimized negative log-likelihood of the best run.
AIC_Gumbel <-
  2 * (dim(res.Gumbel)[2] - 1) + 2 * res.Gumbel[bestGumbel, d + 3]
BIC_Gumbel <-
  log(dim(excess_stand_matrix)[1]) * (dim(res.Gumbel)[2] - 1) + 2 *
  res.Gumbel[bestGumbel, d + 3]
Fit a MGPD reverse Gumbel model

Note that the reverse Gumbel model can be very unstable, but its AIC and BIC are always much larger than those of the other two models

# Reverse-Gumbel fit; runs kept commented, results read from CSV.
# nb.run <- 50
# fit.RevGumbel_fun <- function(i) {
#   library('pracma')
# alpha0 <- runif(d, 1.2, 2)
# b0 <- rep(0.1, 2)
# beta1 <- 0
# 
# fitRevGumbelU_standard <-
#   optim(
#     par = c(alpha0, b0),
#     fn = LLKRevGumbelMGPDstand_U,
#     x = excess_stand_matrix,
#     beta1 = beta1,
#     lw = c(1e-12,-1),
#     up = c(100, 1),
#     control = list(maxit = 1e7, reltol = 1e-15)
#   )
# 
#   return(
#     c(
#       fitRevGumbelU_standard$par[1:d],
#       fitRevGumbelU_standard$par[-(1:d)],
#       fitRevGumbelU_standard$value
#     )
#   )
# }
# x_revgumbel <- foreach(i = 1:nb.run) %dopar%  fit.RevGumbel_fun(i)
# res.RevGumbel  <-
#   matrix(
#     unlist(x_revgumbel),
#     nrow = nb.run,
#     ncol = 2 * d,
#     byrow = T
#   )
# 
#  res.RevGumbel <- as.data.frame(res.RevGumbel)
#  colnames(res.RevGumbel) <-
#    c("alpha1", "alpha2", "alpha3", "beta2", "beta3", "LLK")
# 
# write.csv(res.RevGumbel, file = 'res.RevGumbelFitWeek3.csv')
res.RevGumbel <- read.csv(file = "res.RevGumbelFitWeek3.csv", header = TRUE, sep = ",")
res.RevGumbel <- res.RevGumbel[, -1]  # drop the row-name column from write.csv
bestRevGumbel <- which.min(res.RevGumbel$LLK)
# 5 free parameters; last column is the minimized negative log-likelihood.
AIC_RevGumbel <-
  2 * (dim(res.RevGumbel)[2] - 1) + 2 * res.RevGumbel[bestRevGumbel, d + 3]
BIC_RevGumbel <-
  log(dim(excess_stand_matrix)[1]) * (dim(res.RevGumbel)[2] - 1) + 2 *
  res.RevGumbel[bestRevGumbel, d + 3]
Fit a MGPD reverse Exponential model
# Reverse-Exponential fit; runs kept commented, results read from CSV.
#  nb.run <- 50
#  fit.RevExpo_fun <- function(i) {
#    library('pracma')
#   alpha0 <- runif(d, 0.2, 5)
# b0 <- runif(d - 1, -2.5, 2.5)
# beta1 <- 0
# 
# fitRevExpoU_standard <- optim(
#   par = c(alpha0, b0),
#   fn = LLKRevExpoMGPDstand_U,
#   x = excess_stand_matrix,
#   beta1 = beta1,
#   lw = c(0.1,-3),
#   up = c(100, 3),
#   control = list(maxit = 1e7, reltol = 1e-15)
# )
#    return(
#      c(
#       fitRevExpoU_standard$par[1:d],
#        fitRevExpoU_standard$par[-(1:d)],
#       fitRevExpoU_standard$value
#      )
#    )
#  }
# 
# x_revexpo <- foreach(i = 1:nb.run) %dopar%  fit.RevExpo_fun(i)
#  res.RevExpo <-
#    matrix(unlist(x_revexpo),
#           nrow = nb.run,
#           ncol = 2 * d,
#           byrow = T)
# 
#  res.RevExpo <- as.data.frame(res.RevExpo)
#  colnames(res.RevExpo) <-
#    c("alpha1", "alpha2", "alpha3", "beta2", "beta3", "LLK")
# 
# write.csv(res.RevExpo, file = 'res.RevExpoFitWeek3.csv')
res.RevExpo <- read.csv(file = "res.RevExpoFitWeek3.csv", header = TRUE, sep = ",")
res.RevExpo <- res.RevExpo[, -1]  # drop the row-name column from write.csv
bestRevExpo <- which.min(res.RevExpo$LLK)
# 5 free parameters; last column is the minimized negative log-likelihood.
AIC_RevExpo <-
  2 * (dim(res.RevExpo)[2] - 1) + 2 * res.RevExpo[bestRevExpo, d + 3]
BIC_RevExpo <-
  log(dim(excess_stand_matrix)[1]) * (dim(res.RevExpo)[2] - 1) + 2 *
  res.RevExpo[bestRevExpo, d + 3]

Table 3 a)

# Table 3a: AIC/BIC of the three candidate MGPD dependence models.
tab3 <- data.frame(
  Gumbel    = c(AIC_Gumbel, BIC_Gumbel),
  RevGumbel = c(AIC_RevGumbel, BIC_RevGumbel),
  RevExpo   = c(AIC_RevExpo, BIC_RevExpo),
  row.names = c("AIC", "BIC")
)
kable(tab3)
Gumbel RevGumbel RevExpo
AIC 194.4243 2188.914 207.7058
BIC 201.7530 2196.243 215.0345

Selected model = Gumbel model

# Parameter estimates of the retained Gumbel model (M1).
# beta1 is fixed at 0; only beta2 and beta3 were estimated.
est.alpha_M1 <- as.numeric(res.Gumbel[bestGumbel, seq_len(d)])
est.beta_M1 <- c(0, as.numeric(res.Gumbel[bestGumbel, (d + 1):(d + 2)]))

# Minimized negative log-likelihood of M1 (last column of the results).
LLKGumbel <- as.numeric(res.Gumbel[bestGumbel, d + 3])

Model simplification

\(\beta\) fixed equal to 0

# Model M2: all beta fixed to 0; only the d = 3 alpha parameters are
# estimated, starting from the M1 estimates.
alpha0 <- est.alpha_M1
beta <- rep(0, d)

fitGumbelU_standard_alpha <-
  optim(
    par = alpha0,
    fn = LLKGumbelMGPDstand_U_alpha,
    x = excess_stand_matrix,
    beta = beta,
    lw = c(1.1),
    up = c(Inf),
    control = list(maxit = 1e4, reltol = 1e-10)
  )

# AIC/BIC with k = number of parameters actually estimated in M2 (= 3).
# FIX: the original penalised with k = 5, the parameter count of the full
# model M1, which overstates both criteria for M2.
k_M2 <- length(fitGumbelU_standard_alpha$par)
AIC_M2 <-
  2 * k_M2 + 2 * fitGumbelU_standard_alpha$value
BIC_M2 <-
  log(dim(excess_stand_matrix)[1]) * k_M2 +
  2 * fitGumbelU_standard_alpha$value

# Likelihood-ratio test of M2 against M1; df = difference in free
# parameters. The statistic is not named `T` to avoid masking base TRUE.
lr_stat_M2 <-
  -2 * (LLKGumbel - fitGumbelU_standard_alpha$value)
LR_M2 <-
  pchisq(q = lr_stat_M2,
         df = (5 - length(fitGumbelU_standard_alpha$par)),
         lower.tail = FALSE)

all \(\alpha\) are equal

# Model M3: a single common alpha across weeks; beta2 and beta3 free
# (beta1 stays fixed at 0), so 3 estimated parameters.
a0 <- mean(est.alpha_M1)
b0 <- est.beta_M1[2:3]
beta1 <- 0

fitGumbelU_standard_abeta <-
  optim(
    par = c(a0, b0),
    fn = LLKGumbelMGPDstand_U_abeta,
    x = excess_stand_matrix,
    beta1 = beta1,
    lw = c(1.1,-4),
    up = c(Inf, 4),
    control = list(maxit = 1e4, reltol = 1e-10)
  )

# AIC/BIC with k = parameters estimated in M3 (= 3).
# FIX: the original penalised with k = 5 (the count of the full model M1).
k_M3 <- length(fitGumbelU_standard_abeta$par)
AIC_M3 <-
  2 * k_M3 + 2 * fitGumbelU_standard_abeta$value
BIC_M3 <-
  log(dim(excess_stand_matrix)[1]) * k_M3 +
  2 * fitGumbelU_standard_abeta$value

# LR test of M3 vs M1; statistic not named `T` to avoid masking base TRUE.
lr_stat_M3 <-
  -2 * (LLKGumbel - fitGumbelU_standard_abeta$value)
LR_M3 <-
  pchisq(
    q = lr_stat_M3,
    df = 5 - length(fitGumbelU_standard_abeta$par),
    lower.tail = FALSE
  )

\(\beta\) fixed equal to 0 and all \(\alpha\) equal

 # Model M4: a single common alpha, all beta fixed to 0 (1 free parameter).
 a0 <- mean(est.alpha_M1)
 beta <- rep(0, d)

 # One-dimensional problem, so optimize() is used. Its return value has
 # components $minimum (argmin) and $objective (value); it has NO $par.
 fitGumbelU_standard_a <- optimize(
   f = LLKGumbelMGPDstand_U_a,
   x = excess_stand_matrix,
   beta = beta,
   lw = c(1.1),
   up = c(Inf),
   interval = c(1.1, 10)
 )

 AIC_M4 <- 2 + 2 * fitGumbelU_standard_a$objective
 BIC_M4 <-
   log(dim(excess_stand_matrix)[1]) + 2 * fitGumbelU_standard_a$objective

 # LR test of M4 vs M1.
 # BUG FIX: the original used length(fitGumbelU_standard_a$par), but
 # optimize() returns no $par, so length(NULL) = 0 silently gave df = 5.
 # The correct df is 5 - 1 = 4 (M1 has 5 free parameters, M4 has 1).
 lr_stat_M4 <- -2 * (LLKGumbel - fitGumbelU_standard_a$objective)
 LR_M4 <-
   pchisq(
     q = lr_stat_M4,
     df = 5 - 1,
     lower.tail = FALSE
   )
 
 LR_M4
## [1] 1.915197e-12

Table for model simplification (supplementary material)

# Supplementary table: AIC, BIC and LR-test p-values for models M1-M4
# (no LR test for the full model M1 itself).
tab4 <- data.frame(
  AIC = c(AIC_Gumbel, AIC_M2, AIC_M3, AIC_M4),
  BIC = c(BIC_Gumbel, BIC_M2, BIC_M3, BIC_M4),
  LR  = c(NA, LR_M2, LR_M3, LR_M4),
  row.names = c("M1", "M2", "M3", "M4")
)
kable(tab4, digits = 20)
AIC BIC LR
M1 194.4243 201.7530 NA
M2 257.3854 264.7141 2.129006e-14
M3 220.1578 227.4865 2.582474e-06
M4 250.3020 251.7677 1.915197e-12

Selected model : M1

Table : estimates of the parameters

# Save and display the M1 parameter estimates; columns are weeks 1-3.
est.matrix_M1 <- rbind(est.alpha_M1, est.beta_M1)
write.csv(est.matrix_M1, file = "EstimatesParametersWeek3.csv")
est.M1 <- data.frame(est.matrix_M1)
rownames(est.M1) <- c("alpha", "beta")
colnames(est.M1) <- as.character(seq_len(3))
kable(est.M1)
1 2 3
alpha 2.220851 10.3661425 3.2111572
beta 0.000000 0.8367395 0.5957117

Prediction of year 2019

Estimation of the probability that the episode is extreme (Week 3 exceeds its threshold) when the Week 1 and Week 2 excesses are both negative (x1 < 0 and x2 < 0)

# Empirical probability that Week 3 exceeds its GPD threshold given that
# Weeks 1 and 2 both stayed below theirs, over seasons 1985-2018.
den <- 0
num <- 0
for (i in 1985:2018) {
  temp <- epid_data[epid_data$season == i, ]
  # One row per week per season, so each comparison is a length-1 logical;
  # the multiplication acts as a logical AND coerced to 0/1.
  # den: weeks 1 and 2 below threshold; num: additionally week 3 above.
  den <-
    den + (temp[temp$weeks == 1,]$t_inc < thres_gpd[1]) * (temp[temp$weeks == 2,]$t_inc < thres_gpd[2])
  num <-
    num + (temp[temp$weeks == 1,]$t_inc < thres_gpd[1]) * (temp[temp$weeks == 2,]$t_inc < thres_gpd[2]) * (temp[temp$weeks == 3,]$t_inc > thres_gpd[3])
}

# Conditional relative frequency; reused in the 2019 prediction below.
proba_extreme <- num / den
write.csv(proba_extreme, file = "ProbaExtremeWeek3.csv")

2019 standardized excesses

# Build the 2019 epidemic data frame and its (standardized) excesses over
# the week-specific GPD thresholds.
incidences2019 <- subset(x = incidences, subset = (season == 2019))
epid_data2019 <-
  EpidemicDataFrame(data = incidences2019, d = 3, thres = thres_epid)

epid_data2019$excess <- rep(NA, length(epid_data2019$season))
epid_data2019$excess_stand <- rep(NA, length(epid_data2019$season))

# A season keeps NA excesses unless at least one of its weeks exceeds the
# corresponding threshold (i.e. the epidemic is extreme).
for (j in unique(epid_data2019$season)) {
  temp <- epid_data2019[epid_data2019$season == j, ]
  if (sum(temp$t_inc > thres_gpd, na.rm = T) == 0) {
    epid_data2019[epid_data2019$season == j, ]$excess <- rep(NA, d)
  }
  else {
    epid_data2019[epid_data2019$season == j, ]$excess <-
      temp$t_inc- thres_gpd
  }
}

# Standardize by the marginal Exponential scale of each week.
for (i in 1:d) {
  epid_data2019[epid_data2019$weeks == i, ]$excess_stand <-
    epid_data2019[epid_data2019$weeks == i, ]$excess / sigma_univ[i]
}

Fix exceedances thresholds

We will estimate the probability that the Week 3 ILI incidence rates will exceed the following thresholds, defined as multiples of the observed maximum of the ILI rates of the third week of all epidemics (which is equal to 1729).

# Prediction thresholds as multiples of the largest observed Week-3 rate.
thres <- max(epid_data[epid_data$weeks == 3, ]$t_inc)
thres_stand <- (thres-thres_gpd[3])/sigma_univ[3]
mult_pred <- c(0.5, 0.75,0.95,1,1.2)

# NOTE(review): thres_pred = mult * (thres - u3)/sigma3, i.e. multiples of
# the standardized maximum EXCESS. The table below labels the thresholds as
# mult * thres on the raw scale, which would standardize to
# (mult*thres - u3)/sigma3 instead - confirm which definition is intended.
thres_pred <- mult_pred*thres_stand
write.csv(thres_pred, file="FixedPredThresWeek3.csv")

# Display the raw-scale thresholds (multiples of the observed maximum).
thres_max <- data.frame(t(mult_pred*thres))
colnames(thres_max) <- c("0.5", "0.75", "0.95", "1", "1.2")
rownames(thres_max) <- c( "thresholds")
kable(thres_max)
0.5 0.75 0.95 1 1.2
thresholds 864.5 1296.75 1642.55 1729 2074.8

Table 4 a)

# Table 4a: probability that the 2019 Week-3 incidence exceeds each fixed
# threshold, under the fitted MGPD Gumbel model and under a ridge-penalized
# logistic regression (glmnet with alpha = 0, lambda = 1).
proba.pred <- matrix(NA, nrow = 1, ncol = length(thres_pred))
proba.predict_GPD <- matrix(NA, nrow = 1, ncol = length(thres_pred))
proba.predict_LOGIT <-
  matrix(NA, nrow = 1, ncol = length(thres_pred))

topredict_2019 <-
  data.frame(week1 = epid_data2019[epid_data2019$weeks == 1,]$excess_stand,
             week2 = epid_data2019[epid_data2019$weeks == 2,]$excess_stand)

# Conditioning values: the 2019 standardized excesses of Weeks 1 and 2.
# NOTE(review): the factor 2 presumably matches the parameterization of
# excessprobGumbelU_3g12 / proba_extreme_3g12 - confirm against their code.
x12 <-
  2 * c(epid_data2019[epid_data2019$weeks == 1, ]$excess_stand, epid_data2019[epid_data2019$weeks == 2, ]$excess_stand)

for (i in 1:length(thres_pred)) {
  # MGPD forecast: P(excess3 > v | x12, extreme) * P(extreme | x12).
  proba.predict_GPD[, i] <-
    excessprobGumbelU_3g12(
      v3 = thres_pred[i],
      x12 = x12,
      alpha = est.alpha_M1,
      beta = est.beta_M1
    ) * proba_extreme_3g12(x12 = x12)
  
}
# Design matrix for the logistic competitor: Weeks 1-2 standardized
# excesses of the extreme epidemics.
 reg_2019 <-
    matrix(data = c(epid_data_ssNA[epid_data_ssNA$weeks == 1, ]$excess_stand,
       epid_data_ssNA[epid_data_ssNA$weeks == 2, ]$excess_stand),ncol = 2)
    
# Logistic predictions are computed only for the two lowest thresholds;
# the compiled table reports NA for the remaining columns.
for (i in 1:2){

      response <- as.numeric(epid_data_ssNA[epid_data_ssNA$weeks == 3, ]$excess_stand > thres_pred[i])
    
    
  fit <- glmnet(x = reg_2019, y = response, family = "binomial", alpha = 0, lambda = 1)

  proba.predict_LOGIT[, i] <-  predict(object = fit,
                                       newx = matrix(x12, ncol =2),
                                       type = "response")
}
# Assemble the display table (row 1: raw thresholds; rows 2-3: forecasts).
proba.pred <- data.frame(matrix(c(thres*mult_pred,
                    proba.predict_GPD,
                    proba.predict_LOGIT), ncol =  5, byrow = T))
colnames(proba.pred) <- c("0.5", "0.75", "0.95", "1", "1.2")
rownames(proba.pred) <- c("Threshold", "GP", "Logistic")
kable(proba.pred)
0.5 0.75 0.95 1 1.2
Threshold 864.5000000 1296.7500000 1.64255e+03 1.729e+03 2.0748e+03
GP 0.1851058 0.0119246 1.22860e-03 6.952e-04 7.1200e-05
Logistic 0.1743116 0.1041239 NA NA NA

Section 5.2 i) : leave-one-out cross-validation

Data + GPD thresholds

# Pool the 1985-2018 epidemics with 2019, then recompute the GPD thresholds
# as the 90% quantile of all weekly incidences (same level for each week).
epid_data_8519 <- rbind(epid_data, epid_data2019)
nb.epid <- length(unique(epid_data_8519$season))

level_gpd <- c(0.9, 0.9, 0.9)
thres_gpd <- rep(NA, 3)
for (k in seq_len(3)) {
  thres_gpd[k] <-
    quantile(incidences$t_inc, probs = level_gpd[k], na.rm = TRUE)
}

Create train/tests data sets

# Leave-one-out cross-validation: for each season, refit the marginal
# Exponential excess models on the remaining seasons (train) and
# standardize the held-out season (test) with those training scales.
# Side effect: sigma_univ is overwritten at each fold and, after the loop,
# holds the scales of the LAST fold.
train_list <- list()
test_list <- list()

for (i in 1:nb.epid) {
  season_i <- unique(epid_data_8519$season)[i]
  index.remove <- which(epid_data_8519$season == season_i)
  epid_data_test <-
    subset(x = epid_data_8519, subset = season == season_i)
  epid_data_LOO <- epid_data_8519[-index.remove,]
 
  # Marginal Exponential fit: one scale parameter per week, MLE on the
  # training seasons only.
  sigma_univ <- rep(0, d)
  df.param_univ <- rep(NA, d)
  for (k in 1:d) {
    week <-
      epid_data_LOO[(epid_data_LOO$weeks == k) &
                      !is.na(epid_data_LOO$t_inc), ]$t_inc
    univ.fit <-
      fevd(
        x = week,
        threshold = thres_gpd[k],
        type = "Exponential",
        method = "MLE"
      )
    sigma_univ[k] <- univ.fit$results$par[1]
  }
  # Training excesses: a season keeps NA unless at least one week exceeds
  # its threshold.
  `%nin%` = Negate(`%in%`)
  epid_data_LOO$excess <- rep(NA, length(epid_data_LOO$season))
  epid_data_LOO$excess_stand <-
    rep(NA, length(epid_data_LOO$season))
  for (j in epid_data_LOO$season) {
    temp <- epid_data_LOO[epid_data_LOO$season == j,]
    if (sum(temp$t_inc > thres_gpd[1:d], na.rm = T) == 0) {
      epid_data_LOO[epid_data_LOO$season == j,]$excess <- rep(NA, d)
    }
    else {
      epid_data_LOO[epid_data_LOO$season == j,]$excess <-
        temp$t_inc - thres_gpd[1:d]
    }
  }
  
  # Standardize training excesses by the fold-specific scales.
  for (k in 1:d) {
    epid_data_LOO[epid_data_LOO$weeks == k,]$excess_stand <-
      epid_data_LOO[epid_data_LOO$weeks == k,]$excess / sigma_univ[k]
  }
  # The training set keeps only the extreme (fully observed) epidemics.
  season_NA_LOO <-
    unique(subset(x = epid_data_LOO, is.na(epid_data_LOO$excess))$season)
  train_list[[i]] <-
    subset(x = epid_data_LOO, season %nin% season_NA_LOO)
  
  # The held-out season always gets (possibly negative) excesses.
  epid_data_test$excess <- rep(0, 3)
  epid_data_test$excess_stand <- rep(0, 3)
  for (k in 1:d) {
    epid_data_test[epid_data_test$weeks == k,]$excess <-
      epid_data_test[epid_data_test$weeks == k,]$t_inc - thres_gpd[k]
    epid_data_test[epid_data_test$weeks == k,]$excess_stand <-
      epid_data_test[epid_data_test$weeks == k,]$excess / sigma_univ[k]
  }
  test_list[[i]] <- epid_data_test
}

Leave-one-out fit

# Fit the MGPD model on each LOO training set in parallel. LOO_fit is an
# external helper (presumably defined by the PredictEpidemicTools.Rmd
# knitted in the preamble - confirm).
LOO_fit_parallel <- foreach(
  i = 1:nb.epid,
  .packages = c("extRemes", "pracma"),
  .combine = rbind
) %dopar% {
  LOO_fit(i,train_list = train_list)
}

res.fitLOO_week3 <- data.frame(LOO_fit_parallel)
colnames(res.fitLOO_week3) <- c("Indice", "alpha1", "alpha2","alpha3", "beta1", "beta2", "beta3")

# Collect the per-fold alpha estimates as length-3 vectors.
est.alpha_list <- list()
for (i in 1:nb.epid) {
  est.alpha_list [[i]] <-
    c(res.fitLOO_week3[i,]$alpha1,
      res.fitLOO_week3[i,]$alpha2,
      res.fitLOO_week3[i,]$alpha3)
}

# Same for the per-fold beta estimates.
est.beta_list <- list()
for (i in 1:nb.epid) {
  est.beta_list [[i]] <-
    c(res.fitLOO_week3[i,]$beta1,
      res.fitLOO_week3[i,]$beta2,
      res.fitLOO_week3[i,]$beta3)
}

Leave-one-out prediction

# Leave-one-out predictions for both competitors; pred_parallel_GPD and
# pred_parallel_LOGIT are external helpers evaluated in parallel.
res_parallel_GPD <- foreach(
  i = 1:nb.epid,
  .packages = c("extRemes", "pracma"),
  .combine = rbind
) %dopar% {
  pred_parallel_GPD(i, train_list = train_list, test_list = test_list, thres_pred = thres_pred)
}


# Columns: observed Week-3 value, one GP forecast per threshold, and the
# per-season log-likelihood (used later for anomaly detection).
res_week3_LOO_GPD <- data.frame(res_parallel_GPD)
colnames(res_week3_LOO_GPD) <-
  c(
    "Week3",
    "Predict_Gumbel_05",
    "Predict_Gumbel_075",
    "Predict_Gumbel_095",
    "Predict_Gumbel_1",
    "Predict_Gumbel_12",
    "LLK"
  )

# One row per season, in chronological order 1985-2019.
res_week3_LOO_GPD$season <- 1985:2019
res_parallel_LOGIT<- foreach(
  i = 1:nb.epid,
  .packages = c("glmnet"),
  .combine = rbind
) %dopar% {
  pred_parallel_LOGIT(i, train_list = train_list, test_list = test_list, thres_pred = thres_pred)
}


res_week3_LOO_LOGIT <- data.frame(res_parallel_LOGIT)
colnames(res_week3_LOO_LOGIT) <-
  c(
    "Predict_Logit_05",
    "Predict_Logit_075",
    "Predict_Logit_095",
    "Predict_Logit_1",
    "Predict_Logit_12"
  )

# Merge GP and logistic predictions side by side (rows align by season).
res_week3_LOO <- cbind(res_week3_LOO_GPD, res_week3_LOO_LOGIT)

Prediction probabilities plots (Figures 4a) and b))

Threshold = 0.5 \(\times\) max = 864 (Figure 4a))

# Figure 4a: LOO predicted probabilities at threshold 0.5 x max, grouped by
# the observed outcome - GP model (left) vs logistic regression (right).
res_week3_05 <- subset(
  res_week3_LOO,
  select = c(Week3, Predict_Gumbel_05, Predict_Logit_05)
)
res_week3_05$Outcome <- as.numeric(res_week3_05$Week3 > thres_pred[1])

x.plot <- ggplot(data = res_week3_05,
                 aes(x = as.factor(Outcome), y = Predict_Gumbel_05)) +
  geom_point(shape = 1, size = 3) +
  ggtitle("GP prediction") +
  xlab("Outcome") +
  ylab("Prediction probability") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  ) +
  ylim(0, 1)

y.plot <- ggplot(data = res_week3_05,
                 aes(x = as.factor(Outcome), y = Predict_Logit_05)) +
  geom_point(shape = 1, size = 3) +
  ggtitle("Logistic regression") +
  xlab("Outcome") +
  ylab("Prediction probability") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  ) +
  ylim(0, 1)

# Draw both panels side by side.
z.plot <- grid.arrange(x.plot, y.plot, ncol = 2)

Threshold = 0.75 \(\times\) max = 1,297 (Figure 4b))

# Figure 4b: same display at threshold 0.75 x max.
res_week3_075 <- subset(
  res_week3_LOO,
  select = c(Week3, Predict_Gumbel_075, Predict_Logit_075)
)
res_week3_075$Outcome <- as.numeric(res_week3_075$Week3 > thres_pred[2])

x.plot <- ggplot(data = res_week3_075,
                 aes(x = as.factor(Outcome), y = Predict_Gumbel_075)) +
  geom_point(shape = 1, size = 3) +
  ggtitle("GP prediction") +
  xlab("Outcome") +
  ylab("Prediction probability") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  ) +
  ylim(0, 1)

y.plot <- ggplot(data = res_week3_075,
                 aes(x = as.factor(Outcome), y = Predict_Logit_075)) +
  geom_point(shape = 1, size = 3) +
  ggtitle("Logistic regression") +
  xlab("Outcome") +
  ylab("Prediction probability") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  ) +
  ylim(0, 1)

# Draw both panels side by side.
z.plot <- grid.arrange(x.plot, y.plot, ncol = 2)

Threshold = 0.95 \(\times\) max = 1,642 (not shown)

# Threshold 0.95 x max (not shown in the paper): GP predictions only - the
# logistic model is not available at this level.
res_week3_095 <- subset(
  res_week3_LOO,
  select = c(Week3, Predict_Gumbel_095)
)
res_week3_095$Outcome <- as.numeric(res_week3_095$Week3 > thres_pred[3])

x.plot <- ggplot(data = res_week3_095,
                 aes(x = as.factor(Outcome), y = Predict_Gumbel_095)) +
  geom_point(shape = 1, size = 3) +
  ggtitle("GP prediction") +
  xlab("Outcome") +
  ylab("Prediction probability") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  ) +
  ylim(0, 1)
x.plot

Threshold = 1 \(\times\) max = 1,729 (not shown)

# Threshold 1 x max (not shown): GP predictions; the plot object is built
# but, as in the original, not printed here.
res_week3_1 <- subset(res_week3_LOO, select = c(Week3, Predict_Gumbel_1))
res_week3_1$Outcome <- as.numeric(res_week3_1$Week3 > thres_pred[4])

x.plot <- ggplot(data = res_week3_1,
                 aes(x = as.factor(Outcome), y = Predict_Gumbel_1)) +
  geom_point(shape = 1, size = 3) +
  ggtitle("GP prediction") +
  xlab("Outcome") +
  ylab("Prediction probability") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  ) +
  ylim(0, 1)

Threshold = 1.2 \(\times\) max = 2,075 (not shown)

# Threshold 1.2 x max (not shown): GP predictions only.
res_week3_12 <- subset(res_week3_LOO, select = c(Week3, Predict_Gumbel_12))
res_week3_12$Outcome <- as.numeric(res_week3_12$Week3 > thres_pred[5])

x.plot <- ggplot(data = res_week3_12,
                 aes(x = as.factor(Outcome), y = Predict_Gumbel_12)) +
  geom_point(shape = 1, size = 3) +
  ggtitle("GP prediction") +
  xlab("Outcome") +
  ylab("Prediction probability") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  ) +
  ylim(0, 1)
x.plot

Table 6 (Week 3) - Standardized Brier scores

## Table 6 (Week 3) - standardized Brier scores.
## std_brier = 1 - BS / (p * (1 - p)), with p the outcome base rate:
## positive values beat the constant no-skill forecast p.
## NA predictions are dropped via na.rm (matching the original's handling
## of the logistic forecasts; GP forecasts contain no NA here).
std_brier <- function(pred, outcome, p = mean(outcome)) {
  1 - mean((pred - outcome) ^ 2, na.rm = TRUE) / (p * (1 - p))
}

## 0.5
p_05 <- mean(res_week3_05$Outcome)
brier_gumbel_05 <- std_brier(res_week3_05$Predict_Gumbel_05, res_week3_05$Outcome, p_05)
brier_logit_05 <- std_brier(res_week3_05$Predict_Logit_05, res_week3_05$Outcome, p_05)

## 0.75
p_075 <- mean(res_week3_075$Outcome)
brier_gumbel_075 <- std_brier(res_week3_075$Predict_Gumbel_075, res_week3_075$Outcome, p_075)
brier_logit_075 <- std_brier(res_week3_075$Predict_Logit_075, res_week3_075$Outcome, p_075)

## 0.95
## NOTE(review): res_week3_095 has no Predict_Logit_095 column, so the
## logistic score is NaN here (as in the original compiled table).
p_095 <- mean(res_week3_095$Outcome)
brier_gumbel_095 <- std_brier(res_week3_095$Predict_Gumbel_095, res_week3_095$Outcome, p_095)
brier_logit_095 <- std_brier(res_week3_095$Predict_Logit_095, res_week3_095$Outcome, p_095)

## 1
p_1 <- mean(res_week3_1$Outcome)
brier_gumbel_1 <- std_brier(res_week3_1$Predict_Gumbel_1, res_week3_1$Outcome, p_1)

## 1.2
p_12 <- mean(res_week3_12$Outcome)
brier_gumbel_12 <- std_brier(res_week3_12$Predict_Gumbel_12, res_week3_12$Outcome, p_12)

# Assemble the table; no logistic forecasts exist for thresholds >= 1.
brier_score <- data.frame(matrix(
  c(brier_gumbel_05, brier_gumbel_075, brier_gumbel_095, brier_gumbel_1, brier_gumbel_12,
    brier_logit_05, brier_logit_075, brier_logit_095, NA, NA),
  byrow = TRUE, ncol = 5
))
rownames(brier_score) <- c("GP", "Logistic")
colnames(brier_score) <- c(0.5, 0.75, 0.95, 1, 1.2)
kable(brier_score)
0.5 0.75 0.95 1 1.2
GP 0.3273776 0.6862972 -0.4237232 -0.8492024 -Inf
Logistic 0.0569408 0.0247828 NaN NA NA

ROC and AUC (not shown in the paper)

Threshold = 0.5 \(\times\) max = 864

# Precision-recall curve at threshold 0.5 x max: sweep the predicted
# probabilities (sorted decreasing) as discrimination thresholds.
dis.thres_LOGIT <- sort(res_week3_05$Predict_Logit_05, decreasing = TRUE)
pr.data_LOGIT <-
  data.frame(DisThres = dis.thres_LOGIT, Model = rep("Logistic", length(dis.thres_LOGIT)))
for (i in 1:length(dis.thres_LOGIT)) {
  # Precision = TP / predicted positives; Recall = TP / actual positives.
  pr.data_LOGIT$Precision[i] <- sum((res_week3_05$Outcome == 1) &
                                      (res_week3_05$Predict_Logit_05 > dis.thres_LOGIT[i]) , na.rm = T 
                                    ) / sum((res_week3_05$Predict_Logit_05 > dis.thres_LOGIT[i]), na.rm = T)
  pr.data_LOGIT$Recall[i] <- sum((res_week3_05$Outcome == 1) &
                                      (res_week3_05$Predict_Logit_05 > dis.thres_LOGIT[i]), na.rm = T 
                                    ) / sum( (res_week3_05$Outcome == 1))
}


dis.thres_MGPD <- sort(res_week3_05$Predict_Gumbel_05, decreasing = TRUE)
pr.data_MGPD <-
  data.frame(DisThres = c(dis.thres_MGPD),
             Model = rep("Gumbel", length(dis.thres_MGPD)))

for (i in 1:length(dis.thres_MGPD)) {

  # NOTE(review): unlike the logistic branch, the precision denominator
  # below has no na.rm - harmless only if the Gumbel predictions contain
  # no NA; confirm.
   pr.data_MGPD$Precision[i] <- sum((res_week3_05$Outcome == 1) &
                                      (res_week3_05$Predict_Gumbel_05 > dis.thres_MGPD[i]) , na.rm = T 
                                    ) / sum((res_week3_05$Predict_Gumbel_05 > dis.thres_MGPD[i]) )
  pr.data_MGPD$Recall[i] <- sum((res_week3_05$Outcome == 1) &
                                      (res_week3_05$Predict_Gumbel_05 > dis.thres_MGPD[i]), na.rm = T 
                                    ) / sum( (res_week3_05$Outcome == 1))
}



### Precision and recall plot ###
pr.data <- rbind(pr.data_MGPD, pr.data_LOGIT)

y.plot <-
  ggplot(data = pr.data, aes(
    x = Recall,
    y = Precision,
    colour = Model,
    linetype = Model
  )) + geom_path()
# Horizontal line at the outcome base rate = no-skill precision.
y.plot <- y.plot + geom_hline(yintercept = sum( (res_week3_05$Outcome == 1))/length(res_week3_05$Outcome))

y.plot <- y.plot + xlim(c(0, 1)) + ylim(c(0, 1))
y.plot <- y.plot + scale_color_manual(values = c("blue", "red"))
y.plot <- y.plot + xlab("Recall") + ylab("Precision")
y.plot <- y.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
y.plot

## AUC of the PR curve (MLmetrics::PRAUC); NA logistic forecasts dropped ##
auc_pr_GP_05 <- PRAUC(res_week3_05$Predict_Gumbel_05, res_week3_05$Outcome)
auc_pr_Logit_05 <- PRAUC(res_week3_05[!is.na(res_week3_05$Predict_Logit_05),]$Predict_Logit_05,res_week3_05[!is.na(res_week3_05$Predict_Logit_05),]$Outcome)

## Average precision score: sum of precision x recall increments ##
N <- length(pr.data_MGPD$Recall)
AP_GP_05 <- sum(pr.data_MGPD$Precision*(pr.data_MGPD$Recall - c(0,pr.data_MGPD$Recall[-N])), na.rm = T)
AP_LOGIT_05 <- sum(pr.data_LOGIT$Precision*(pr.data_LOGIT$Recall - c(0,pr.data_LOGIT$Recall[-N])), na.rm = T)

Threshold = 0.75 \(\times\) max = 1,297

# Precision-recall analysis at threshold 0.75 x max (same construction as
# for 0.5 x max above).
dis.thres_LOGIT <- sort(res_week3_075$Predict_Logit_075, decreasing = TRUE)
pr.data_LOGIT <-
  data.frame(DisThres = dis.thres_LOGIT, Model = rep("Logistic", length(dis.thres_LOGIT)))
for (i in 1:length(dis.thres_LOGIT)) {

  # Precision = TP / predicted positives; Recall = TP / actual positives.
  pr.data_LOGIT$Precision[i] <- sum((res_week3_075$Outcome == 1) &
                                      (res_week3_075$Predict_Logit_075 > dis.thres_LOGIT[i]) , na.rm = T 
                                    ) / sum((res_week3_075$Predict_Logit_075 > dis.thres_LOGIT[i]), na.rm = T)
  pr.data_LOGIT$Recall[i] <- sum((res_week3_075$Outcome == 1) &
                                      (res_week3_075$Predict_Logit_075 > dis.thres_LOGIT[i]), na.rm = T 
                                    ) / sum( (res_week3_075$Outcome == 1))
}


dis.thres_MGPD <- sort(res_week3_075$Predict_Gumbel_075, decreasing = TRUE)
pr.data_MGPD <-
  data.frame(DisThres = c(dis.thres_MGPD),
             Model = rep("Gumbel", length(dis.thres_MGPD)))

for (i in 1:length(dis.thres_MGPD)) {
   pr.data_MGPD$Precision[i] <- sum((res_week3_075$Outcome == 1) &
                                      (res_week3_075$Predict_Gumbel_075 > dis.thres_MGPD[i]) , na.rm = T 
                                    ) / sum((res_week3_075$Predict_Gumbel_075 > dis.thres_MGPD[i]) )
  pr.data_MGPD$Recall[i] <- sum((res_week3_075$Outcome == 1) &
                                      (res_week3_075$Predict_Gumbel_075 > dis.thres_MGPD[i]), na.rm = T 
                                    ) / sum( (res_week3_075$Outcome == 1))
}



### Precision and recall plot ###
pr.data <- rbind(pr.data_MGPD, pr.data_LOGIT)

y.plot <-
  ggplot(data = pr.data, aes(
    x = Recall,
    y = Precision,
    colour = Model,
    linetype = Model
  )) + geom_path()
# Horizontal line at the no-skill precision (outcome base rate).
y.plot <- y.plot + geom_hline(yintercept = sum( (res_week3_075$Outcome == 1))/length(res_week3_075$Outcome))

y.plot <- y.plot + xlim(c(0, 1)) + ylim(c(0, 1))
y.plot <- y.plot + scale_color_manual(values = c("blue", "red"))
y.plot <- y.plot + xlab("Recall") + ylab("Precision")
y.plot <- y.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
y.plot

## AUC of the PR curve ##
auc_pr_GP_075 <- PRAUC(res_week3_075$Predict_Gumbel_075, res_week3_075$Outcome)
auc_pr_Logit_075 <- PRAUC(res_week3_075[!is.na(res_week3_075$Predict_Logit_075),]$Predict_Logit_075,res_week3_075[!is.na(res_week3_075$Predict_Logit_075),]$Outcome)



## Average precision score ##
N <- length(pr.data_MGPD$Recall)
AP_GP_075 <- sum(pr.data_MGPD$Precision*(pr.data_MGPD$Recall - c(0,pr.data_MGPD$Recall[-N])), na.rm = T)
AP_LOGIT_075 <- sum(pr.data_LOGIT$Precision*(pr.data_LOGIT$Recall - c(0,pr.data_LOGIT$Recall[-N])), na.rm = T)

Threshold = 0.95 \(\times\) max = 1,642

# Precision-recall analysis at threshold 0.95 x max: GP model only (no
# logistic forecasts at this level).
dis.thres_MGPD <- sort(res_week3_095$Predict_Gumbel_095, decreasing = TRUE)
pr.data_MGPD <-
  data.frame(DisThres = c(dis.thres_MGPD),
             Model = rep("Gumbel", length(dis.thres_MGPD)))

for (i in 1:length(dis.thres_MGPD)) {
  
  # Precision = TP / predicted positives; Recall = TP / actual positives.
  pr.data_MGPD$Precision[i] <- sum((res_week3_095$Outcome == 1) &
                                      (res_week3_095$Predict_Gumbel_095 > dis.thres_MGPD[i]) , na.rm = T 
                                    ) / sum((res_week3_095$Predict_Gumbel_095 > dis.thres_MGPD[i]) )
  pr.data_MGPD$Recall[i] <- sum((res_week3_095$Outcome == 1) &
                                      (res_week3_095$Predict_Gumbel_095 > dis.thres_MGPD[i]), na.rm = T 
                                    ) / sum( (res_week3_095$Outcome == 1))
}

### Precision and recall plot ###
pr.data <- rbind(pr.data_MGPD)

y.plot <-
  ggplot(data = pr.data, aes(
    x = Recall,
    y = Precision,
    colour = Model,
    linetype = Model
  )) + geom_path()
# Horizontal line at the no-skill precision (outcome base rate).
y.plot <- y.plot + geom_hline(yintercept = sum( (res_week3_095$Outcome == 1))/length(res_week3_095$Outcome))

y.plot <- y.plot + xlim(c(0, 1)) + ylim(c(0, 1))
y.plot <- y.plot + scale_color_manual(values = c("blue"))
y.plot <- y.plot + xlab("Recall") + ylab("Precision")
y.plot <- y.plot + theme(
  panel.grid.minor = element_blank(),
  text = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.text.y = element_text(size = 14),
  plot.title = element_text(size = 18)
)
y.plot

## AUC of the PR curve ##
auc_pr_GP_095 <- PRAUC(res_week3_095$Predict_Gumbel_095, res_week3_095$Outcome)



## Average precision score ##
N <- length(pr.data_MGPD$Recall)
AP_GP_095 <- sum(pr.data_MGPD$Precision*(pr.data_MGPD$Recall - c(0,pr.data_MGPD$Recall[-N])), na.rm = T)

Threshold = 1 \(\times\) max = 1,729

dis.thres_MGPD <- sort(res_week3_1$Predict_Gumbel_1, decreasing = TRUE)
pr.data_MGPD <-
  data.frame(DisThres = c(dis.thres_MGPD),
             Model = rep("Gumbel", length(dis.thres_MGPD)))



for (i in 1:length(dis.thres_MGPD)) {
  pr.data_MGPD$Precision[i] <- sum((res_week3_1$Outcome == 1) &
                                      (res_week3_1$Predict_Gumbel_1 > dis.thres_MGPD[i]) , na.rm = T 
                                    ) / sum((res_week3_1$Predict_Gumbel_1 > dis.thres_MGPD[i]) )
  pr.data_MGPD$Recall[i] <- sum((res_week3_1$Outcome == 1) &
                                      (res_week3_1$Predict_Gumbel_1 > dis.thres_MGPD[i]), na.rm = T 
                                    ) / sum( (res_week3_1$Outcome == 1))
}

### Precision and recall ### 
pr.data <- rbind(pr.data_MGPD)

## Precision-recall curve for the 1 x max threshold.
## The horizontal line marks the baseline precision (overall share of
## positive outcomes).
y.plot <- ggplot(data = pr.data,
                 aes(x = Recall, y = Precision,
                     colour = Model, linetype = Model)) +
  geom_path() +
  geom_hline(yintercept = mean(res_week3_1$Outcome == 1)) +
  xlim(c(0, 1)) +
  ylim(c(0, 1)) +
  scale_color_manual(values = c("blue")) +
  xlab("Recall") +
  ylab("Precision") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(size = 18)
  )
y.plot

## AUC of the precision-recall curve (MLmetrics::PRAUC) ###
auc_pr_GP_1 <-
  PRAUC(res_week3_1$Predict_Gumbel_1, res_week3_1$Outcome)



## Average precision score: precision weighted by the recall increments ###
N <- length(pr.data_MGPD$Recall)
## Step sizes between consecutive recall values (first step starts at 0).
recall.steps <- c(pr.data_MGPD$Recall[1], diff(pr.data_MGPD$Recall))
AP_GP_1 <- sum(pr.data_MGPD$Precision * recall.steps, na.rm = TRUE)

Threshold = 1.2 \(\times\) max = 2,075

Cannot be computed because there is no exceedance of this threshold.

Table of AUC values for the precision-recall curves (PRC)

## AUC-PR summary table: rows = models, columns = threshold multipliers.
## NA entries are thresholds for which the metric could not be computed.
auc_prc <- data.frame(rbind(
  GP       = c(auc_pr_GP_05, auc_pr_GP_075, auc_pr_GP_095, auc_pr_GP_1, NA),
  Logistic = c(auc_pr_Logit_05, auc_pr_Logit_075, NA, NA, NA)
))
colnames(auc_prc) <- c(0.5, 0.75, 0.95, 1, 1.2)
kable(auc_prc) 
0.5 0.75 0.95 1 1.2
GP 0.5537879 0.5694444 0.3333333 0.25 NA
Logistic 0.2428498 0.1811975 NA NA NA

Table of Average Precision Score

## Average-precision summary table: rows = models, columns = threshold
## multipliers; NA marks combinations that could not be computed.
aprec_score <- data.frame(rbind(
  GP       = c(AP_GP_05, AP_GP_075, AP_GP_095, AP_GP_1, NA),
  Logistic = c(AP_LOGIT_05, AP_LOGIT_075, NA, NA, NA)
))
colnames(aprec_score) <- c(0.5, 0.75, 0.95, 1, 1.2) 
kable(aprec_score)
0.5 0.75 0.95 1 1.2
GP 0.7742424 0.9166667 0.5 0.5 NA
Logistic 0.4775092 0.2651727 NA NA NA

Section 4.3

## Per-season anomaly score: negative log-likelihood of each season under
## the fitted model (leave-one-out predictions).
anomaly.df <- data.frame(
  season = unique(epid_data_8519$season),
  LLK = -log(res_week3_LOO$LLK)
)
## NOTE(review): rows 30 and 32 are dropped — presumably seasons excluded
## from the anomaly analysis; confirm against the manuscript.
anomaly.df <- anomaly.df[-c(30, 32), ]
## Get simulations
## Load the simulated excess trajectories (one row per season-week).
simul.dataU_Gumbel <-
  read.csv("SimulatedDataWeek3.csv",
    row.names = 1
  )
## Keep only the week-3 rows to select the "extreme" simulated season.
simulGumbel_W3 <- simul.dataU_Gumbel[simul.dataU_Gumbel$weeks == 3, ]
## Pick the simulated season whose week-3 excess lies within +/- 1e-5 of
## the empirical 99% quantile (a tolerance window, since an exact match
## on floating-point values is unreliable).
q99 <- quantile(simulGumbel_W3$excess, probs = 0.99)
index1 <- which(simulGumbel_W3$excess > q99-1e-5)
temp <- simulGumbel_W3[index1, ]
## NOTE(review): `temp$weeks == 3` is redundant here — temp already
## contains only week-3 rows; kept as-is for reproducibility.
index2 <- which(temp[temp$weeks == 3,]$excess < q99+1e-5)
extreme.point.season <- temp[index2, ]$season
## Retrieve the full (all-weeks) trajectory of the selected season.
extreme.point.df <-
  simul.dataU_Gumbel[simul.dataU_Gumbel$season == extreme.point.season,]
extreme.point.vec <- as.vector(extreme.point.df$excess)
## Score the trajectory under the fitted Gumbel MGPD density and store its
## negative log-likelihood (the anomaly score used in Figure 3).
LLK.extreme <-
  DensityGumbelMGPDstand_U(x = extreme.point.vec, alpha = est.alpha_M1, beta = est.beta_M1)
extreme.point.df$LLK <- -log(LLK.extreme)
## Artificial "abnormal" trajectory: rescale the extreme point's three
## weekly excesses by (1.5, 0.5, 1.5) to break the within-season shape
## while keeping a comparable magnitude.
abnormal.point.vec <- extreme.point.vec[1:3] * c(1.5, 0.5, 1.5)
abnormal.point.df <- data.frame(
  season = rep(1, 3),
  weeks = 1:3,
  excess = abnormal.point.vec
)

## Anomaly score of the perturbed trajectory under the fitted density.
LLK.abnormal <- DensityGumbelMGPDstand_U(
  x = abnormal.point.vec,
  alpha = est.alpha_M1,
  beta = est.beta_M1
)
abnormal.point.df$LLK <- -log(LLK.abnormal)

## Observed 2010 season (named for the 2009-2010 "swine flu" pandemic),
## taken directly from the per-season anomaly scores.
swine.point.df <- data.frame(
  season = 2010,
  LLK = anomaly.df[anomaly.df$season == 2010, ]$LLK
)

Table 4

## Null distribution of the anomaly score: negative log-likelihoods of
## simulated week-3 trajectories, used to calibrate detection levels.
res_week3_simus <-
  read.csv(file = "PredSimus_Week3.csv", row.names = 1)

LLKsimus <- -log(res_week3_simus$LLK)

## Detection levels at the 10%, 5%, 1% and 0.1% tail probabilities.
## levels.1 and levels.01 are reused as reference lines in Figure 3.
q.levels <- quantile(LLKsimus, probs = c(0.9, 0.95, 0.99, 0.999))
levels.10 <- q.levels[1]
levels.5 <- q.levels[2]
levels.1 <- q.levels[3]
levels.01 <- q.levels[4]

tab4 <- data.frame(t(c(levels.10, levels.5, levels.1, levels.01)))
rownames(tab4) <- c("level")
colnames(tab4) <- c("10%", "5%", "1%", "0.1%")
kable(tab4)
10% 5% 1% 0.1%
level 4.720036 5.603064 7.78854 14.50502

Figure 3

## Figure 3: per-season anomaly scores (open circles) with three
## highlighted points — the extreme simulated season (square, drawn at
## x = 2001), the perturbed "abnormal" trajectory (triangle, at x = 2015)
## and the 2010 season (filled circle) — plus the 1% (dashed) and 0.1%
## (dotted) calibration levels from Table 4.
x.plot <- ggplot(data = anomaly.df, aes(x = season, y = LLK)) +
  geom_point(shape = 1) +
  geom_point(data = extreme.point.df, aes(2001, LLK),
             shape = 15, size = 2) +  # square
  geom_point(data = abnormal.point.df, aes(2015, LLK),
             shape = 17, size = 2) +  # triangle
  geom_point(data = swine.point.df, aes(2010, LLK),
             shape = 19, size = 2) +  # circle
  geom_hline(yintercept = levels.1, linetype = 2, color = "blue") +
  geom_hline(yintercept = levels.01, linetype = 3, color = "blue") +
  xlab("Season") +
  ylab("Negative log-likelihood") +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 16),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 18)
  )
x.plot